{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import { parseHTML } from \"npm:linkedom\";\n",
    "import { Database, Statement } from \"jsr:@db/sqlite@0.11\";\n",
    "import * as sqlitePath from \"npm:sqlite-path\";\n",
    "import * as sqliteUrl from \"npm:sqlite-url\";\n",
    "import * as sqliteRegex from \"npm:sqlite-regex\";"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "const months = [\"january\", \"february\", \"march\", \"april\", \"may\", \"june\", \"july\", \"august\", \"september\", \"october\", \"november\", \"december\"]\n",
    "\n",
    "const db = new Database(\":memory:\")\n",
    "db.enableLoadExtension = true;\n",
    "db.loadExtension(sqlitePath.getLoadablePath());\n",
    "db.loadExtension(sqliteUrl.getLoadablePath());\n",
    "db.loadExtension(sqliteRegex.getLoadablePath());\n",
    "db.enableLoadExtension = false;\n",
    "\n",
    "db.exec(`\n",
    "  CREATE TABLE articles(\n",
    "    slug_id TEXT,\n",
    "    slug TEXT,\n",
    "    headline TEXT,\n",
    "    url TEXT,\n",
    "    year integer,\n",
    "    month integer,\n",
    "    category1 TEXT,\n",
    "    category2 TEXT\n",
    "  )\n",
    "`)\n",
    "\n",
    "const stmt = db.prepare(`\n",
    "  insert into articles\n",
    "  select\n",
    "    regex_capture(\n",
    "      '(?P<slug>.+)-(?P<id>rcna\\\\d+)',\n",
    "      path_at(url_path(:url), -1),\n",
    "      'id'\n",
    "    ) as id,\n",
    "     regex_capture(\n",
    "      '(?P<slug>.+)-(?P<id>rcna\\\\d+)',\n",
    "      path_at(url_path(:url), -1),\n",
    "      'slug'\n",
    "    ) as slug,\n",
    "    :headline as headline,\n",
    "    :url as url,\n",
    "    :year as year,\n",
    "    :month as month,\n",
    "    path_at(url_path(:url), 0) as category1,\n",
    "    iif(\n",
    "      path_length(url_path(:url)) > 2,\n",
    "      path_at(url_path(:url), 1),\n",
    "      null\n",
    "    ) as category2\n",
    "`);\n",
    "\n",
    "const insertArticles = db.transaction((year, month, articles) => {\n",
    "  for(const article of articles) {\n",
    "    stmt.run({...article, year, month})\n",
    "  }\n",
    "})\n",
    "\n",
    "async function insertMonth(year:number, month: text) {\n",
    "  const monthPage = await fetch(`https://www.nbcnews.com/archive/articles/${year}/${month}`).then(r=>r.text())\n",
    "  const {document:monthPageDoc} = parseHTML(monthPage);\n",
    "  const monthEntries = monthPageDoc\n",
    "    .querySelectorAll('.MonthPage a')\n",
    "    .map(a => ({headline: a.innerText, url: a.getAttribute('href')}));\n",
    "    insertArticles(year, months.findIndex(m => m === month)+1, monthEntries)\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "for(let year = 2014; year <= 2023; year++) {\n",
    "  for(const month of months) {\n",
    "    console.log(year, month);\n",
    "    await insertMonth(year, month);\n",
    "  }\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "db.exec(\"vacuum into 'articles.db'\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "db.sql`select * from articles order by random() limit 10`"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Deno",
   "language": "typescript",
   "name": "deno"
  },
  "language_info": {
   "codemirror_mode": "typescript",
   "file_extension": ".ts",
   "mimetype": "text/x.typescript",
   "name": "typescript",
   "nbconvert_exporter": "script",
   "pygments_lexer": "typescript",
   "version": "5.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
